# load packages
library(readr)
library(dplyr)
library(ggplot2)
library(gridExtra)
library(corrplot)
library(plotly)
library(reshape2)
library(car)
library(kableExtra)
library(broom)
library(knitr)
library(pROC)
library(ggpattern)
library(tidyr)
# import dataset
female <- read_csv("/Users/hollyduong/Desktop/DA 401/ChildMarriageInVietnam/Data/wm.csv")
# get a glimpse of dataset
head(female)
## # A tibble: 6 × 458
## HH1 HH2 LN WM1 WM2 WM3 WMINT WM4 WM5 WM6D WM6M WM6Y WM8
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 2 3 1 2 3 92 91 92 19 11 2020 2
## 2 1 4 2 1 4 2 92 91 92 18 11 2020 1
## 3 1 9 4 1 9 4 92 91 92 18 11 2020 1
## 4 1 10 4 1 10 4 92 91 92 18 11 2020 2
## 5 1 11 4 1 11 4 92 91 92 19 11 2020 2
## 6 1 11 5 1 11 5 92 91 92 18 11 2020 2
## # ℹ 445 more variables: WM9 <dbl>, WM17 <dbl>, WM7H <dbl>, WM7M <dbl>,
## # WM10H <dbl>, WM10M <dbl>, WM11 <dbl>, WM12 <dbl>, WM13 <dbl>, WM14 <dbl>,
## # WM15 <dbl>, WM22 <dbl>, WM23 <dbl>, WM24 <dbl>, WMHINT <dbl>, WMFIN <dbl>,
## # WB3M <dbl>, WB3Y <dbl>, WB4 <dbl>, WB5 <dbl>, WB6A <dbl>, WB6B <dbl>,
## # WB7 <dbl>, WB9 <lgl>, WB10A <lgl>, WB10B <lgl>, WB11 <lgl>, WB12A <lgl>,
## # WB12B <lgl>, WB14 <dbl>, WB15 <dbl>, WB16 <dbl>, WB17 <dbl>, WB18 <dbl>,
## # WB19A <chr>, WB19B <chr>, WB19C <chr>, WB19D <chr>, WB19E <chr>, …
# Select the specified columns to create a new dataframe
female_df <- select(female, WAGEM, MSTATUS, HH6, HH7, welevel, insurance, ethnicity, windex5, CP2, HA1, MT4, MT9, MT11)
# View the first few rows of the new dataframe
summary(female_df)
## WAGEM MSTATUS HH6 HH7 welevel
## Min. :10.00 Min. :1.000 Min. :1.000 Min. :1.000 Min. :0.00
## 1st Qu.:18.00 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:1.00
## Median :20.00 Median :1.000 Median :2.000 Median :3.000 Median :2.00
## Mean :20.92 Mean :1.413 Mean :1.683 Mean :3.404 Mean :2.46
## 3rd Qu.:23.00 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:5.000 3rd Qu.:3.00
## Max. :47.00 Max. :9.000 Max. :2.000 Max. :6.000 Max. :9.00
## NA's :2026 NA's :11 NA's :11 NA's :11 NA's :524
## insurance ethnicity windex5 CP2
## Min. :1.000 Min. :1.000 Min. :0.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000
## Median :1.000 Median :1.000 Median :2.000 Median :1.000
## Mean :1.135 Mean :2.035 Mean :2.494 Mean :1.431
## 3rd Qu.:1.000 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:2.000
## Max. :9.000 Max. :6.000 Max. :5.000 Max. :9.000
## NA's :524 NA's :864
## HA1 MT4 MT9 MT11
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.00
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.00
## Median :1.000 Median :2.000 Median :1.000 Median :1.00
## Mean :1.215 Mean :1.664 Mean :1.365 Mean :1.12
## 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:1.00
## Max. :9.000 Max. :9.000 Max. :9.000 Max. :9.00
## NA's :524 NA's :524 NA's :2496 NA's :524
# Rename the columns
female_df <- female_df %>%
rename(
age_first_marriage = WAGEM,
marital_status = MSTATUS,
area = HH6,
region = HH7,
education_level = welevel,
health_insurance = insurance,
ethnicity = ethnicity,
wealth_index = windex5,
# contraceptive_decision_maker = CP5, #R
current_contraceptive_use = CP2,
# age_first_sexual_intercourse = SB1, #R
# currently_married = MA1,
# partner_age = MA2, #R
awareness_hiv_aids = HA1,
used_computer_tablet = MT4,
used_internet = MT9,
owns_mobile_phone = MT11
)
# Summarize missing values by column
summarize_missing <- sapply(female_df, function(x) sum(is.na(x)))
print(summarize_missing)
## age_first_marriage marital_status area
## 2026 11 11
## region education_level health_insurance
## 11 524 524
## ethnicity wealth_index current_contraceptive_use
## 0 0 864
## awareness_hiv_aids used_computer_tablet used_internet
## 524 524 2496
## owns_mobile_phone
## 524
# Recode the value 9 to NA for specified variables
female_df <- female_df %>%
mutate(
current_contraceptive_use = na_if(current_contraceptive_use, 9),
used_internet = na_if(used_internet, 9),
health_insurance = na_if(health_insurance, 9),
education_level = na_if(education_level, 9),
awareness_hiv_aids = na_if(awareness_hiv_aids, 9),
used_computer_tablet = na_if(used_computer_tablet, 9),
owns_mobile_phone = na_if(owns_mobile_phone, 9),
marital_status = na_if(marital_status, 9)
)
# Recode the value 6 to NA for 'ethnicity'
female_df$ethnicity <- na_if(female_df$ethnicity, 6)
# Recode the value 0 to NA for 'wealth_index'
female_df$wealth_index <- na_if(female_df$wealth_index, 0)
# Recoding variables from 1 (Yes) and 2 (No) to 1 (Yes) and 0 (No)
female_df$health_insurance <- ifelse(female_df$health_insurance == 2, 0, female_df$health_insurance)
female_df$current_contraceptive_use <- ifelse(female_df$current_contraceptive_use == 2, 0, female_df$current_contraceptive_use)
female_df$awareness_hiv_aids <- ifelse(female_df$awareness_hiv_aids == 2, 0, female_df$awareness_hiv_aids)
female_df$used_computer_tablet <- ifelse(female_df$used_computer_tablet == 2, 0, female_df$used_computer_tablet)
female_df$owns_mobile_phone <- ifelse(female_df$owns_mobile_phone == 2, 0, female_df$owns_mobile_phone)
female_df$used_internet <- ifelse(female_df$used_internet == 2, 0, female_df$used_internet)
# View the changes to ensure NA substitution has been correctly applied
summary(female_df)
## age_first_marriage marital_status area region
## Min. :10.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:18.00 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:2.000
## Median :20.00 Median :1.000 Median :2.000 Median :3.000
## Mean :20.92 Mean :1.408 Mean :1.683 Mean :3.404
## 3rd Qu.:23.00 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:5.000
## Max. :47.00 Max. :3.000 Max. :2.000 Max. :6.000
## NA's :2026 NA's :18 NA's :11 NA's :11
## education_level health_insurance ethnicity wealth_index
## Min. :0.00 Min. :0.0000 Min. :1.000 Min. :1.000
## 1st Qu.:1.00 1st Qu.:1.0000 1st Qu.:1.000 1st Qu.:1.000
## Median :2.00 Median :1.0000 Median :1.000 Median :2.000
## Mean :2.46 Mean :0.8659 Mean :1.586 Mean :2.615
## 3rd Qu.:3.00 3rd Qu.:1.0000 3rd Qu.:2.000 3rd Qu.:4.000
## Max. :5.00 Max. :1.0000 Max. :4.000 Max. :5.000
## NA's :525 NA's :525 NA's :1148 NA's :524
## current_contraceptive_use awareness_hiv_aids used_computer_tablet
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000
## Median :1.0000 Median :1.0000 Median :0.0000
## Mean :0.5842 Mean :0.7975 Mean :0.3412
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## NA's :885 NA's :541 NA's :532
## used_internet owns_mobile_phone
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:1.0000
## Median :1.0000 Median :1.0000
## Mean :0.6394 Mean :0.8831
## 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000
## NA's :2501 NA's :529
Access to Media# Combine individual variables for access to the internet, phone, and computer into a single variable
# This new variable "access_to_media" will have a value of 1 if the respondent has access to any of these media sources, and 0 if not
# This provides a more comprehensive measure of media access
female_df <- female_df %>%
mutate(access_to_media = ifelse(used_computer_tablet == 1 | used_internet == 1 | owns_mobile_phone == 1, 1, 0))
# In later analysis, use access_to_media instead of the 3 separate variables
# Exporting female_df to a CSV file in the current working directory
#write.csv(female_df, "female_df.csv", row.names = FALSE)
# Histogram for 'Age at First Marriage'
afm_hist <- ggplot(female_df, aes(x = age_first_marriage)) +
geom_histogram(fill = "cyan", color = "black") +
theme_minimal() +
ggtitle("Histogram of Age at First Marriage") +
xlab("Age at First Marriage") +
ylab("Frequency")
print(afm_hist)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2026 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Saving the histogram
#ggsave("histogram_age_first_marriage.png", plot = afm_hist, width = 8, height = 6, dpi = 300)
# # Plotting a boxplot for 'Age at First Marriage'
# ggplot(female_df, aes(y = age_first_marriage)) +
# geom_boxplot(fill = "cyan") +
# labs(title = "Boxplot of Age at First Marriage", y = "Age at First Marriage") +
# theme_minimal()
# # Bar plot for 'used_internet'
# ggplot(female_df, aes(x = used_internet)) +
# geom_bar(fill = "yellow") +
# theme_minimal() +
# ggtitle("Bar Plot of Used Internet") +
# xlab("Used Internet") +
# ylab("Count")
# Before that, let's double check the count of missing values by column
count_missing <- sapply(female_df, function(x) sum(is.na(x)))
print(count_missing)
## age_first_marriage marital_status area
## 2026 18 11
## region education_level health_insurance
## 11 525 525
## ethnicity wealth_index current_contraceptive_use
## 1148 524 885
## awareness_hiv_aids used_computer_tablet used_internet
## 541 532 2501
## owns_mobile_phone access_to_media
## 529 529
# Make a copy of female_df for imputation
imputed_df <- female_df
# Handle missing values in 'Age at First Marriage'
# Calculate the median value for 'Age at First Marriage', excluding NA values
median_age_first_marriage <- median(imputed_df$age_first_marriage, na.rm = TRUE)
# Impute missing values in 'Age at First Marriage' with the median value
imputed_df$age_first_marriage[is.na(imputed_df$age_first_marriage)] <- median_age_first_marriage
# Define a function to calculate mode for categorical variables
getMode <- function(v) {
# The mode is the value that appears most frequently in the data
uniqv <- unique(na.omit(v)) # Omit NA values and get unique values
uniqv[which.max(tabulate(match(v, uniqv)))] # Return the value with the highest frequency
}
# For 'ethnicity', an ordinal variable with predefined categories, it makes sense to impute missing values with the mode.
mode_ethnicity <- getMode(imputed_df$ethnicity)
imputed_df <- mutate(imputed_df, ethnicity = ifelse(is.na(ethnicity), mode_ethnicity, ethnicity))
mode_area <- getMode(imputed_df$area)
imputed_df <- mutate(imputed_df, area = ifelse(is.na(area), mode_area, area))
mode_region <- getMode(imputed_df$region)
imputed_df <- mutate(imputed_df, region = ifelse(is.na(region), mode_region, region))
mode_marital_status <- getMode(imputed_df$marital_status)
imputed_df <- mutate(imputed_df, marital_status = ifelse(is.na(marital_status), mode_marital_status, marital_status))
# Binary variables like 'current_contraceptive_use', 'health_insurance','awareness_hiv_aids', 'used_internet', 'used_computer_tablet', 'owns_mobile_phone', and 'access_to_media'
# should be imputed with the mode since it represents the most frequent category (either 0 or 1).
# Calculate the mode for each binary variable
mode_used_internet <- getMode(imputed_df$used_internet)
mode_current_contraceptive_use <- getMode(imputed_df$current_contraceptive_use)
mode_health_insurance <- getMode(imputed_df$health_insurance)
mode_awareness_hiv_aids <- getMode(imputed_df$awareness_hiv_aids)
mode_used_computer_tablet <- getMode(imputed_df$used_computer_tablet)
mode_owns_mobile_phone <- getMode(imputed_df$owns_mobile_phone)
mode_access_to_media <- getMode(imputed_df$access_to_media)
# Impute missing values for binary variables
imputed_df <- mutate(imputed_df,
used_internet = ifelse(is.na(used_internet), mode_used_internet, used_internet),
current_contraceptive_use = ifelse(is.na(current_contraceptive_use), mode_current_contraceptive_use, current_contraceptive_use),
health_insurance = ifelse(is.na(health_insurance), mode_health_insurance, health_insurance),
awareness_hiv_aids = ifelse(is.na(awareness_hiv_aids), mode_awareness_hiv_aids, awareness_hiv_aids),
used_computer_tablet = ifelse(is.na(used_computer_tablet), mode_used_computer_tablet, used_computer_tablet),
owns_mobile_phone = ifelse(is.na(owns_mobile_phone), mode_owns_mobile_phone, owns_mobile_phone),
access_to_media = ifelse(is.na(access_to_media), mode_access_to_media, access_to_media)
)
# 'education_level' is an ordinal variable where the median could be a more suitable measure of central tendency than the mode.
# However, given the categorical nature of the levels (e.g., "Primary", "Secondary"), using the mode may still be appropriate.
mode_education_level <- getMode(imputed_df$education_level)
imputed_df <- mutate(imputed_df, education_level = ifelse(is.na(education_level), mode_education_level, education_level))
# 'wealth_index' is an ordinal variable where the median could be a more suitable measure of central tendency than the mode.
# However, given the categorical nature of the levels (e.g., "Poorest", "Poor",...), using the mode may still be appropriate.
mode_wealth_index <- getMode(imputed_df$wealth_index)
imputed_df <- mutate(imputed_df, wealth_index = ifelse(is.na(wealth_index), mode_wealth_index, wealth_index))
# Check the resulting dataset to confirm changes
summary(imputed_df)
## age_first_marriage marital_status area region
## Min. :10.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:18.00 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:2.000
## Median :20.00 Median :1.000 Median :2.000 Median :3.000
## Mean :20.76 Mean :1.408 Mean :1.684 Mean :3.403
## 3rd Qu.:23.00 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:5.000
## Max. :47.00 Max. :3.000 Max. :2.000 Max. :6.000
## education_level health_insurance ethnicity wealth_index
## Min. :0.000 Min. :0.0000 Min. :1.000 Min. :1.00
## 1st Qu.:1.000 1st Qu.:1.0000 1st Qu.:1.000 1st Qu.:1.00
## Median :2.000 Median :1.0000 Median :1.000 Median :2.00
## Mean :2.438 Mean :0.8721 Mean :1.527 Mean :2.54
## 3rd Qu.:3.000 3rd Qu.:1.0000 3rd Qu.:2.000 3rd Qu.:4.00
## Max. :5.000 Max. :1.0000 Max. :4.000 Max. :5.00
## current_contraceptive_use awareness_hiv_aids used_computer_tablet
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000
## Median :1.0000 Median :1.0000 Median :0.0000
## Mean :0.6168 Mean :0.8072 Mean :0.3251
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## used_internet owns_mobile_phone access_to_media
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:1.0000
## Median :1.0000 Median :1.0000 Median :1.0000
## Mean :0.7192 Mean :0.8886 Mean :0.9071
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
# After imputation, let's check for missing values
count_imputation <- sapply(imputed_df, function(x) sum(is.na(x)))
print(count_imputation)
## age_first_marriage marital_status area
## 0 0 0
## region education_level health_insurance
## 0 0 0
## ethnicity wealth_index current_contraceptive_use
## 0 0 0
## awareness_hiv_aids used_computer_tablet used_internet
## 0 0 0
## owns_mobile_phone access_to_media
## 0 0
# Histogram for 'age_first_marriage' before imputation
afm_0 <- ggplot(female_df, aes(x = age_first_marriage)) +
geom_histogram(fill = "lightpink", color = "black", bins = 30) +
theme_light() +
ggtitle("Age at First Marriage Before Imputation") +
xlab("Age at First Marriage") +
ylab("Frequency")
# Histogram for 'age_first_marriage' after imputation
afm_imputed <- ggplot(imputed_df, aes(x = age_first_marriage)) +
geom_histogram(fill = "plum", color = "black", bins = 30) +
theme_light() +
ggtitle("Age at First Marriage After Imputation") +
xlab("Age at First Marriage") +
ylab("Frequency")
# Arrange the two plots side by side
grid.arrange(afm_0, afm_imputed, ncol = 2)
## Warning: Removed 2026 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Arrange the two plots side by side and capture the layout as a grob
combined_plots <- arrangeGrob(afm_0, afm_imputed, ncol = 2)
## Warning: Removed 2026 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Now, use ggsave to save the combined plot
#ggsave("combined_age_first_marriage.png", plot = combined_plots, width = 10, height = 5)
# Convert "age at first marriage" into a binary variable to indicate child marriage
# Child marriage is defined as marriage before the age of 18
# The new binary variable "child_marriage" will have a value of 1 if the marriage occurred before age 18, and 0 otherwise
imputed_df <- imputed_df %>%
mutate(child_marriage = ifelse(age_first_marriage < 18, 1, 0))
# Create a binary variable for child marriage under 16
# The new variable "child_marriage_u16" will have a value of 1 if the marriage occurred before age 16, and 0 otherwise
imputed_df <- imputed_df %>%
mutate(child_marriage_u16 = ifelse(age_first_marriage < 16, 1, 0))
# Move "child_marriage" and "child_marriage_u16" to the front of the dataframe
imputed_df <- imputed_df %>%
select(child_marriage, child_marriage_u16, everything())
# Exporting female_df to a CSV file in the current working directory
write.csv(imputed_df, "imputed_df.csv", row.names = FALSE)
# Calculate total observations
total_obs <- nrow(imputed_df)
# Aggregate counts for each category by region without altering the original 'region' field
counts_df <- aggregate(cbind(ever_married = imputed_df$marital_status %in% c(1, 2),
married_u18 = imputed_df$child_marriage == 1,
married_u16 = imputed_df$child_marriage_u16 == 1) ~ region,
data = imputed_df,
FUN = sum)
# Convert counts to percentages
counts_df$ever_married <- (counts_df$ever_married / total_obs) * 100
counts_df$married_u18 <- (counts_df$married_u18 / total_obs) * 100
counts_df$married_u16 <- (counts_df$married_u16 / total_obs) * 100
# Creating the plot with ggplot2, directly labeling regions
p <- ggplot(counts_df, aes(x = factor(region))) +
geom_bar(aes(y = ever_married, fill = "Ever married or cohabited"), stat = "identity") +
geom_bar(aes(y = married_u18, fill = "Married or cohabited before 18 years old"), stat = "identity") +
geom_bar(aes(y = married_u16, fill = "Married or cohabited before 16 years old"), stat = "identity") +
scale_fill_manual(values = c("Ever married or cohabited" = "snow2",
"Married or cohabited before 18 years old" = "grey",
"Married or cohabited before 16 years old" = "black"),
name = "Marital Status") +
labs(x = "Region", y = "Percentage", title = "Prevalence of Child Marriage by Regions Among Females") +
theme_minimal() +
theme(plot.margin = margin(t = 10, r = 10, b = 10, l = 10, unit = "mm"),
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "right") +
scale_x_discrete(labels = c("1" = "Red River Delta", "2" = "Northern Midlands And Mountain",
"3" = "North Central And Central Coastal", "4" = "Central Highlands",
"5" = "South East", "6" = "Mekong River Delta"))
# Convert ggplot object to plotly for interactive visualization
ggplotly(p)
# Calculate correlation matrix
cor_matrix <- cor(imputed_df %>% select_if(is.numeric), use = "complete.obs")
# Melt the correlation matrix
melted_cor_matrix <- melt(cor_matrix)
# Generate an interactive heatmap
corr_matrix <- ggplot(melted_cor_matrix, aes(Var1, Var2, fill = value)) +
geom_tile() +
scale_fill_gradientn(
colours = c("deepskyblue", "white", "red2"),
values = scales::rescale(c(-1, 0, 1)),
limits = c(-1, 1),
name="Pearson\nCorrelation"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
xlab("") +
ylab("") +
ggtitle("Correlation Matrix")
# Convert ggplot object to plotly for interactivity
ggplotly(corr_matrix)
# # Mapping for 'area'
# area_labels <- c("Urban", "Rural")
# names(area_labels) <- c("1", "2")
#
# # Mapping for 'region'
# region_labels <- c("Red River Delta", "Northern Midlands And Mountain", "North Central And Central Coastal", "Central Highlands", "South East", "Mekong River Delta")
# names(region_labels) <- c("1", "2", "3", "4", "5", "6")
#
# # Mapping for 'education_level'
# education_level_labels <- c("None or Pre-Primary", "Primary", "Lower Primary", "Upper Primary", "Vocational High School", "University/College/Higher")
# names(education_level_labels) <- c("0", "1", "2", "3", "4", "5")
#
# # Mapping for 'ethnicity'
# ethnicity_labels <- c("Kinh and Hoa", "Tay, Thai, Muong, Nung", "Khmer", "Mong") # Add or modify as per your dataset
# names(ethnicity_labels) <- c("1", "2", "3", "4")
#
# # Mapping for 'wealth_index'
# wealth_index_labels <- c("Poorest", "Poor", "Middle", "Rich", "Richest")
# names(wealth_index_labels) <- c("1", "2", "3", "4", "5")
# Convert nominal and ordinal variables to factors
imputed_df$area <- as.factor(imputed_df$area)
imputed_df$region <- as.factor(imputed_df$region)
imputed_df$education_level <- factor(imputed_df$education_level, ordered = FALSE)
imputed_df$ethnicity <- as.factor(imputed_df$ethnicity)
imputed_df$wealth_index <- factor(imputed_df$wealth_index, ordered = FALSE)
# Binary variables are already in the correct format and can be used as is
# Baseline model for reference
baseline_model <- glm(child_marriage ~ area + education_level + wealth_index + health_insurance + current_contraceptive_use + awareness_hiv_aids + access_to_media, family = binomial(), data = imputed_df)
# Summarize the baseline model
summary(baseline_model)
##
## Call:
## glm(formula = child_marriage ~ area + education_level + wealth_index +
## health_insurance + current_contraceptive_use + awareness_hiv_aids +
## access_to_media, family = binomial(), data = imputed_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.73391 0.13169 -5.573 2.50e-08 ***
## area2 0.45543 0.07999 5.693 1.25e-08 ***
## education_level1 -0.41350 0.08705 -4.750 2.03e-06 ***
## education_level2 -0.58693 0.08518 -6.890 5.57e-12 ***
## education_level3 -1.52807 0.11502 -13.286 < 2e-16 ***
## education_level4 -3.24812 0.51198 -6.344 2.24e-10 ***
## education_level5 -3.19482 0.25323 -12.616 < 2e-16 ***
## wealth_index2 -0.46183 0.07994 -5.777 7.61e-09 ***
## wealth_index3 -0.80552 0.09964 -8.085 6.24e-16 ***
## wealth_index4 -0.61389 0.10926 -5.618 1.93e-08 ***
## wealth_index5 -0.89341 0.14898 -5.997 2.01e-09 ***
## health_insurance 0.06907 0.08094 0.853 0.393
## current_contraceptive_use 0.40123 0.06065 6.615 3.71e-11 ***
## awareness_hiv_aids -0.47875 0.06960 -6.879 6.05e-12 ***
## access_to_media -0.01461 0.08121 -0.180 0.857
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10439.1 on 11293 degrees of freedom
## Residual deviance: 8493.7 on 11279 degrees of freedom
## AIC: 8523.7
##
## Number of Fisher Scoring iterations: 7
# Calculate VIF (A VIF value > 5 indicates high multicollinearity)
base_vif_results <- vif(baseline_model)
print(base_vif_results)
## GVIF Df GVIF^(1/(2*Df))
## area 1.110732 1 1.053913
## education_level 1.707863 5 1.054983
## wealth_index 1.401192 4 1.043067
## health_insurance 1.021120 1 1.010505
## current_contraceptive_use 1.034853 1 1.017277
## awareness_hiv_aids 1.500511 1 1.224954
## access_to_media 1.268704 1 1.126367
# Check if any VIF value is greater than a typical threshold, like 5 or 10.
base_high_vif <- base_vif_results[base_vif_results > 5]
print(base_high_vif)
## numeric(0)
# Enhanced model with additional fixed effects
enhanced_model <- glm(child_marriage ~ area + region + education_level + ethnicity + wealth_index + health_insurance + current_contraceptive_use + awareness_hiv_aids + access_to_media,
family = binomial(),
data = imputed_df)
# Summarize the new model with FEs
summary(enhanced_model)
##
## Call:
## glm(formula = child_marriage ~ area + region + education_level +
## ethnicity + wealth_index + health_insurance + current_contraceptive_use +
## awareness_hiv_aids + access_to_media, family = binomial(),
## data = imputed_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.54522 0.17906 -8.630 < 2e-16 ***
## area2 0.28689 0.08488 3.380 0.000725 ***
## region2 0.37758 0.12919 2.923 0.003469 **
## region3 0.18170 0.12745 1.426 0.153967
## region4 0.34057 0.12599 2.703 0.006869 **
## region5 -0.05340 0.12612 -0.423 0.672028
## region6 -0.05277 0.13336 -0.396 0.692301
## education_level1 -0.05925 0.09353 -0.634 0.526382
## education_level2 -0.22571 0.09227 -2.446 0.014438 *
## education_level3 -1.17196 0.12172 -9.629 < 2e-16 ***
## education_level4 -2.96956 0.51385 -5.779 7.51e-09 ***
## education_level5 -2.89574 0.25599 -11.312 < 2e-16 ***
## ethnicity2 0.07771 0.10592 0.734 0.463184
## ethnicity3 0.27807 0.12830 2.167 0.030210 *
## ethnicity4 1.15504 0.10626 10.870 < 2e-16 ***
## wealth_index2 -0.12498 0.08523 -1.466 0.142533
## wealth_index3 -0.46330 0.10565 -4.385 1.16e-05 ***
## wealth_index4 -0.26442 0.11695 -2.261 0.023760 *
## wealth_index5 -0.54064 0.16030 -3.373 0.000744 ***
## health_insurance -0.05157 0.08277 -0.623 0.533244
## current_contraceptive_use 0.48480 0.06265 7.739 1.00e-14 ***
## awareness_hiv_aids -0.19387 0.07592 -2.554 0.010661 *
## access_to_media -0.07880 0.08517 -0.925 0.354845
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10439.1 on 11293 degrees of freedom
## Residual deviance: 8240.2 on 11271 degrees of freedom
## AIC: 8286.2
##
## Number of Fisher Scoring iterations: 7
# Calculate VIF (A VIF value > 5 indicates high multicollinearity)
FE_vif_results <- vif(enhanced_model)
print(FE_vif_results)
## GVIF Df GVIF^(1/(2*Df))
## area 1.254248 1 1.119932
## region 4.916245 5 1.172636
## education_level 1.956030 5 1.069394
## ethnicity 4.235677 3 1.272000
## wealth_index 1.934925 4 1.086008
## health_insurance 1.051524 1 1.025438
## current_contraceptive_use 1.049061 1 1.024237
## awareness_hiv_aids 1.679129 1 1.295812
## access_to_media 1.312005 1 1.145428
# Check if any VIF value is greater than a typical threshold, like 5.
FE_high_vif <- FE_vif_results[FE_vif_results > 5]
print(FE_high_vif)
## numeric(0)
No VIF is significant larger than 5. This suggests that the independent variables in my model with Fixed Effects do not suffer from severe multicollinearity.
# Model comparison using AIC
aic_base <- AIC(baseline_model)
aic_enhanced <- AIC(enhanced_model)
cat("AIC - Base Model:", aic_base, "\n")
## AIC - Base Model: 8523.675
cat("AIC - Enhanced Model:", aic_enhanced, "\n")
## AIC - Enhanced Model: 8286.168
Additionally, the lower AIC for the model with two fixed effects compared to the baseline model suggests that adding these fixed effects improves the model’s overall fit. This aligns with my theoretical justification for including region and ethnicity as relevant factors in predicting child marriage.
# Model comparison using BIC
bic_base <- BIC(baseline_model)
bic_enhanced <- BIC(enhanced_model)
cat("BIC - Base Model:", bic_base, "\n")
## BIC - Base Model: 8633.655
cat("BIC - Enhanced Model:", bic_enhanced, "\n")
## BIC - Enhanced Model: 8454.805
Lower BIC values suggest that, despite the added complexity (more parameters), the model with both region and ethnicity provides a better overall fit to my data when adjusted for the number of predictors. This is a strong indication that these variables are meaningful in explaining the variance in child marriage occurrences in my dataset.
# Adding the interaction term between education level and wealth index
model_interaction <- update(enhanced_model, . ~ . + area:wealth_index)
summary(model_interaction)
##
## Call:
## glm(formula = child_marriage ~ area + region + education_level +
## ethnicity + wealth_index + health_insurance + current_contraceptive_use +
## awareness_hiv_aids + access_to_media + area:wealth_index,
## family = binomial(), data = imputed_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.78525 0.21262 -8.396 < 2e-16 ***
## area2 0.54558 0.15257 3.576 0.000349 ***
## region2 0.36523 0.13009 2.807 0.004994 **
## region3 0.16778 0.12857 1.305 0.191877
## region4 0.32337 0.12701 2.546 0.010892 *
## region5 -0.06803 0.12726 -0.535 0.592956
## region6 -0.06059 0.13410 -0.452 0.651371
## education_level1 -0.06117 0.09352 -0.654 0.513085
## education_level2 -0.21895 0.09231 -2.372 0.017701 *
## education_level3 -1.17282 0.12176 -9.632 < 2e-16 ***
## education_level4 -2.96880 0.51416 -5.774 7.74e-09 ***
## education_level5 -2.89550 0.25820 -11.214 < 2e-16 ***
## ethnicity2 0.06171 0.10624 0.581 0.561352
## ethnicity3 0.27262 0.12824 2.126 0.033518 *
## ethnicity4 1.12852 0.10687 10.559 < 2e-16 ***
## wealth_index2 0.26715 0.19445 1.374 0.169473
## wealth_index3 -0.15465 0.21122 -0.732 0.464075
## wealth_index4 -0.04152 0.22195 -0.187 0.851622
## wealth_index5 -0.32837 0.26777 -1.226 0.220078
## health_insurance -0.04238 0.08289 -0.511 0.609164
## current_contraceptive_use 0.49215 0.06274 7.844 4.35e-15 ***
## awareness_hiv_aids -0.18696 0.07599 -2.460 0.013883 *
## access_to_media -0.07199 0.08522 -0.845 0.398196
## area2:wealth_index2 -0.47937 0.21523 -2.227 0.025928 *
## area2:wealth_index3 -0.38304 0.24189 -1.584 0.113299
## area2:wealth_index4 -0.26224 0.25631 -1.023 0.306247
## area2:wealth_index5 -0.25049 0.31993 -0.783 0.433646
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10439.1 on 11293 degrees of freedom
## Residual deviance: 8234.7 on 11267 degrees of freedom
## AIC: 8288.7
##
## Number of Fisher Scoring iterations: 7
cat("AIC (base):", AIC(baseline_model), "\nBIC (base):", BIC(baseline_model), "\n")
## AIC (base): 8523.675
## BIC (base): 8633.655
cat("AIC (w/ interaction terms):", AIC(model_interaction), "\nBIC (w/ interaction terms):", BIC(model_interaction), "\n")
## AIC (w/ interaction terms): 8288.662
## BIC (w/ interaction terms): 8486.627
# Function to add significance asterisks
add_asterisks <- function(p_value) {
if (is.na(p_value)) {
return(NA)
} else if (p_value < 0.001) {
return("***")
} else if (p_value < 0.01) {
return("**")
} else if (p_value < 0.05) {
return("*")
} else {
return("")
}
}
# Function to format confidence intervals as a string
format_ci <- function(lower, upper) {
paste0("(", round(lower, 2), ", ", round(upper, 2), ")")
}
# Tidy the baseline model with confidence intervals
tidy_baseline <- tidy(baseline_model, conf.int = TRUE, exponentiate = TRUE)
# Tidy the enhanced model with confidence intervals
tidy_enhanced <- tidy(enhanced_model, conf.int = TRUE, exponentiate = TRUE)
# Tidy the interaction model with confidence intervals
tidy_interaction <- tidy(model_interaction, conf.int = TRUE, exponentiate = TRUE)
# Apply the function to each model's p.value
tidy_baseline$asterisks <- sapply(tidy_baseline$p.value, add_asterisks)
tidy_enhanced$asterisks <- sapply(tidy_enhanced$p.value, add_asterisks)
tidy_interaction$asterisks <- sapply(tidy_interaction$p.value, add_asterisks)
# Create OR strings with asterisks and format CIs as a string
tidy_baseline <- tidy_baseline %>%
mutate(
OR = ifelse(is.na(estimate), NA, paste0(round(estimate, 2), asterisks)),
CI = ifelse(is.na(conf.low) | is.na(conf.high), NA, format_ci(conf.low, conf.high))
)
tidy_enhanced <- tidy_enhanced %>%
mutate(
OR = ifelse(is.na(estimate), NA, paste0(round(estimate, 2), asterisks)),
CI = ifelse(is.na(conf.low) | is.na(conf.high), NA, format_ci(conf.low, conf.high))
)
tidy_interaction <- tidy_interaction %>%
mutate(
OR = ifelse(is.na(estimate), NA, paste0(round(estimate, 2), asterisks)),
CI = ifelse(is.na(conf.low) | is.na(conf.high), NA, format_ci(conf.low, conf.high))
)
# Add a 'Model' column to each tidied dataframe
tidy_baseline <- tidy_baseline %>% mutate(Model = "Baseline")
tidy_enhanced <- tidy_enhanced %>% mutate(Model = "Enhanced")
tidy_interaction <- tidy_interaction %>% mutate(Model = "Interaction")
# Combine and pivot the dataframes
combined_results <- bind_rows(
tidy_baseline %>% select(term, OR, CI, Model),
tidy_enhanced %>% select(term, OR, CI, Model),
tidy_interaction %>% select(term, OR, CI, Model)
) %>%
pivot_wider(names_from = Model, values_from = c(OR, CI))
# Replace NAs with "—"
combined_results[is.na(combined_results)] <- "—"
# Reordering columns to have OR and CI next to each other for each model
combined_results <- combined_results %>%
select(term,
OR_Baseline, CI_Baseline,
OR_Enhanced, CI_Enhanced,
OR_Interaction, CI_Interaction)
# Print the final combined table
print(combined_results)
## # A tibble: 27 × 7
## term OR_Baseline CI_Baseline OR_Enhanced CI_Enhanced OR_Interaction
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 (Intercept) 0.48*** (0.37, 0.6… 0.21*** (0.15, 0.3) 0.17***
## 2 area2 1.58*** (1.35, 1.8… 1.33*** (1.13, 1.5… 1.73***
## 3 education_lev… 0.66*** (0.56, 0.7… 0.94 (0.78, 1.1… 0.94
## 4 education_lev… 0.56*** (0.47, 0.6… 0.8* (0.67, 0.9… 0.8*
## 5 education_lev… 0.22*** (0.17, 0.2… 0.31*** (0.24, 0.3… 0.31***
## 6 education_lev… 0.04*** (0.01, 0.0… 0.05*** (0.02, 0.1… 0.05***
## 7 education_lev… 0.04*** (0.02, 0.0… 0.06*** (0.03, 0.0… 0.06***
## 8 wealth_index2 0.63*** (0.54, 0.7… 0.88 (0.75, 1.0… 1.31
## 9 wealth_index3 0.45*** (0.37, 0.5… 0.63*** (0.51, 0.7… 0.86
## 10 wealth_index4 0.54*** (0.44, 0.6… 0.77* (0.61, 0.9… 0.96
## # ℹ 17 more rows
## # ℹ 1 more variable: CI_Interaction <chr>